library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggrepel)

Lecture 9

9.1 Adding general details

We have learned the fundamental functionalities of ggplot. Let’s refine the plots.

setwd("~/Documents/ibs_course/BUS240/data")
load("asasec.rda")
head(asasec)
##                                Section         Sname Beginning Revenues
## 1      Aging and the Life Course (018)         Aging     12752    12104
## 2     Alcohol, Drugs and Tobacco (030) Alcohol/Drugs     11933     1144
## 3 Altruism and Social Solidarity (047)      Altruism      1139     1862
## 4            Animals and Society (042)       Animals       473      820
## 5             Asia/Asian America (024)          Asia      9056     2116
## 6            Body and Embodiment (048)          Body      3408     1618
##   Expenses Ending Journal Year Members
## 1    12007  12849      No 2005     598
## 2      400  12677      No 2005     301
## 3     1875   1126      No 2005      NA
## 4     1116    177      No 2005     209
## 5     1710   9462      No 2005     365
## 6     1920   3106      No 2005      NA

This is some data on membership over time in special-interest sections of the American Sociological Association.

In this dataset, we have membership data for each section over a ten year period.

Let’s look at the relationship between section membership and section revenues for a single year, 2014.

p <- ggplot(data = subset(asasec, Year == 2014),
            mapping = aes(x = Members, y = Revenues, label = Sname))
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

This is our basic scatterplot-and-smoother graph. The defalut geom_smooth method is loess (local weighted regression).

To refine it, let’s begin by identifying some outliers, switch from loess to OLS, and introduce a third variable, Journal, into the aes() which is inside geom_point

unique(asasec$Journal)
## [1] No  Yes
## Levels: No Yes
p <- ggplot(data = subset(asasec, Year == 2014),
            mapping = aes(x = Members, y = Revenues, label = Sname))

p + geom_point(mapping = aes(color = Journal)) +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Now we can add some text labels. Let’s build it additively.

p0 <- ggplot(data = subset(asasec, Year == 2014),
             mapping = aes(x = Members, y = Revenues, label = Sname))

p1 <- p0 + geom_smooth(method = "lm", se = FALSE, color = "gray80") +
    geom_point(mapping = aes(color = Journal)) 
p1
## `geom_smooth()` using formula 'y ~ x'

p2 <- p1 + geom_text_repel(data=subset(asasec,
                                       Year == 2014 & Revenues > 7000),
                           size = 2)
p2
## `geom_smooth()` using formula 'y ~ x'

Continuing with the p2 object still, we can label the axes and scales. We also add a title and move the legend to make better use of the space in the plot.

p3 <- p2 + labs(x="Membership",
        y="Revenues",
        color = "Section has own Journal",
        title = "ASA Sections",
        subtitle = "2014 Calendar year.",
        caption = "Source: ASA annual report.")
p3
## `geom_smooth()` using formula 'y ~ x'

p4 <- p3 + scale_y_continuous(labels = scales::dollar) +
     theme(legend.position = "bottom")
p4
## `geom_smooth()` using formula 'y ~ x'

9.2 Color

Consider: ordered vs. unordered categorical variable. What’s the examples?

There are many ways to color your figures, so make sure that the following example is just one of them. We use the RColorBrewer package to make a wide range of named color palettes available. Together with ggplot, you access these colors by specifying the scale_color_brewer() or scale_fill_brewer() functions, depending on the aesthetic you are mapping.

Let’s use organdata we used before.

setwd("~/Documents/ibs_course/BUS240/data")
load("organdata.rda")
head(organdata)
## # A tibble: 6 × 21
##   country   year       donors   pop pop_d…¹   gdp gdp_lag health healt…² pubhe…³
##   <chr>     <date>      <dbl> <int>   <dbl> <int>   <int>  <dbl>   <dbl>   <dbl>
## 1 Australia NA           NA   17065   0.220 16774   16591   1300    1224     4.8
## 2 Australia 1991-01-01   12.1 17284   0.223 17171   16774   1379    1300     5.4
## 3 Australia 1992-01-01   12.4 17495   0.226 17914   17171   1455    1379     5.4
## 4 Australia 1993-01-01   12.5 17667   0.228 18883   17914   1540    1455     5.4
## 5 Australia 1994-01-01   10.2 17855   0.231 19849   18883   1626    1540     5.4
## 6 Australia 1995-01-01   10.2 18072   0.233 21079   19849   1737    1626     5.5
## # … with 11 more variables: roads <dbl>, cerebvas <int>, assault <int>,
## #   external <int>, txp_pop <dbl>, world <chr>, opt <chr>, consent_law <chr>,
## #   consent_practice <chr>, consistent <chr>, ccode <chr>, and abbreviated
## #   variable names ¹​pop_dens, ²​health_lag, ³​pubhealth
## # ℹ Use `colnames()` to see all variable names
p <- ggplot(data = organdata,
            mapping = aes(x = roads, y = donors, color = world))
p + geom_point(size = 2)
## Warning: Removed 34 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Set2") 
## Warning: Removed 46 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Set2") +
  theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Pastel2") +
  theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

p + geom_point(size = 2) + scale_color_brewer(palette = "Dark2") +
  theme(legend.position = "top")
## Warning: Removed 46 rows containing missing values (geom_point).

You can also specify colors manually, via scale_color_manual() or scale_fill_manual(). These functions take a value argument that can be specified as vector of color names or color values that R knows about.

R knows many color names (like red, and green, and cornflowerblue. Try demo(‘colors’) for an overview.

Alternatively, color values can be specified via their hexadecimal RGB value. This is a way of encoding color values in the RGB colorspace, where each channel can take a value from 0 to 255 like this. A color hex value begins with a hash or pound character, #, followed by three pairs of hexadecimal or “hex” numbers. Hex values are in Base 16, with the first six letters of the alphabet standing for the numbers 10 to 15. This allows a two-character hex number to range from 0 to 255. You read them as #rrggbb, where rr is the two-digit hex code for the red channel, gg for the green channel, and bb for the blue channel. So #CC55DD translates in decimal to CC = 204 (red), 55 = 85 (green), and DD = 221 (blue). It gives a strong pink color.

Going back to our ASA Membership plot, for example, we can manually introduce a palette that’s friendly to color-blind viewers.

cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73",
                "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

p4 + scale_color_manual(values = cb_palette) 
## `geom_smooth()` using formula 'y ~ x'

The ability to manually specify colors can be useful when the meaning of a category itself has a strong color association. For example, blue and red for political parties in U.S. What else?

Layer color and text

Let’s work through an example where we use manually specified colors both for emphasis and because of their social meaning. We will build up a plot of data about the 2016 US general election used in the last lecture.

setwd("~/Documents/ibs_course/BUS240/data")
load("county_data.rda")
party_colors <- c("#2E74C0", "#CB454A") # Democrat Blue and Republican Red

Note this variables:

# id. FIPS State and County code (character)
# pop. Population, 2014 estimate
# flipped Did the area flip parties from 2012 to 2016
# black. Black alone, percent, 2013
county_data %>% select(id, pop, flipped, black) %>% sample_n(8)
##      id     pop flipped black
## 1 55035  101564      No   1.0
## 2 30025    3108      No   0.2
## 3 12057 1316298      No  17.4
## 4 19005   14038     Yes   1.5
## 5 30093   34680      No   0.4
## 6 39141   77159      No   6.2
## 7 24013  167830      No   3.5
## 8 55085   35563      No   0.5
p0 <- ggplot(data = subset(county_data,
                           flipped == "No"),
             mapping = aes(x = pop,
                           y = black/100))
p0 + geom_point(color = "gray50") 

p0 + geom_point(alpha = 0.15, color = "gray50") 

p0 + geom_point(alpha = 0.15, color = "gray50") + scale_x_log10(labels=scales::comma) 

Now consider

#partywinner16. Winning party, 2016 Presidental Election.
county_data %>% select(id, partywinner16) %>% sample_n(8)
##      id partywinner16
## 1 16033    Republican
## 2 46107    Republican
## 3 48017    Republican
## 4 19167    Republican
## 5 47057    Republican
## 6 31141    Republican
## 7 22045    Republican
## 8 37165      Democrat
# save previous results in p1
p1 <- p0 + geom_point(alpha = 0.15, color = "gray50") + scale_x_log10(labels=scales::comma) 
p2 <- p1 + geom_point(data = subset(county_data,
                                    flipped == "Yes"),
                      mapping = aes(x = pop, y = black/100,
                                    color = partywinner16)) +
  scale_color_manual(values = party_colors)
p2

Clean up the labels

p3 <- p2 + scale_y_continuous(labels=scales::percent) +
    labs(color = "County flipped to ... ",
         x = "County Population (log scale)",
         y = "Percent Black Population",
         title = "Flipped counties, 2016",
         caption = "Counties in gray did not flip.")

p3

Last, add a third layer using geom_text_repel() function. Once again we supply a set of instructions to subset the data for this text layer. We are interested in the flipped counties that have with a relatively high percentage of African-American residents.

p4 <- p3 + geom_text_repel(data = subset(county_data,
                                      flipped == "Yes" &
                                      black  > 25),
                           mapping = aes(x = pop,
                                   y = black/100,
                                   label = state), size = 2)
p4
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

p4 + theme_minimal() + theme(legend.position="top")

Consider how much information we use from data to picture this.

Theme

Need more style?

theme_set(theme_bw())
p4 + theme(legend.position="top")

theme_set(theme_dark())
p4 + theme(legend.position="top")

Professional touch (not artistic)? Use ggthemes.

library(ggthemes)
theme_set(theme_economist())
p4 + theme(legend.position="top")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

theme_set(theme_wsj())

p4 + theme(plot.title = element_text(size = rel(0.6)),
           legend.title = element_text(size = rel(0.35)),
           plot.caption = element_text(size = rel(0.35)),
           legend.position = "top")

9.3 Others

Y axes on both sides

library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
theme_set(theme_minimal())
setwd("~/Documents/ibs_course/BUS240/data")
load("fredts.rda")
head(fredts)
##         date  sp500 monbase  sp500_i monbase_i
## 1 2009-03-11 696.68 1542228 100.0000  100.0000
## 2 2009-03-18 766.73 1693133 110.0548  109.7849
## 3 2009-03-25 799.10 1693133 114.7012  109.7849
## 4 2009-04-01 809.06 1733017 116.1308  112.3710
## 5 2009-04-08 830.61 1733017 119.2240  112.3710
## 6 2009-04-15 852.21 1789878 122.3245  116.0579
fredts_m <- fredts %>% select(date, sp500_i, monbase_i) %>%
    gather(key = series, value = score, sp500_i:monbase_i)

head(fredts_m)
##         date  series    score
## 1 2009-03-11 sp500_i 100.0000
## 2 2009-03-18 sp500_i 110.0548
## 3 2009-03-25 sp500_i 114.7012
## 4 2009-04-01 sp500_i 116.1308
## 5 2009-04-08 sp500_i 119.2240
## 6 2009-04-15 sp500_i 122.3245
p <- ggplot(data = fredts_m,
            mapping = aes(x = date, y = score,
                          group = series,
                          color = series))
p1 <- p + geom_line() + theme(legend.position = "top") +
    labs(x = "Date",
         y = "Index",
         color = "Series")
p1
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

p <- ggplot(data = fredts,
            mapping = aes(x = date, y = sp500_i - monbase_i))
p2 <- p + geom_line() +
    labs(x = "Date",
         y = "Difference")
p2
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

cowplot::plot_grid(p1, p2, nrow = 2, rel_heights = c(0.75, 0.25), align = "v")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.